In [43]:
import numpy as np
import pandas as pa
import matplotlib.pyplot as plt
In [44]:
# Explicit dtype for every column of the King County house CSV files, so
# pandas does not have to infer them while parsing.
dtype_dict = {
    'bathrooms': float,
    'bedrooms': float,
    'condition': int,
    'date': str,
    'floors': str,
    'grade': int,
    'id': str,
    'lat': float,
    'long': float,
    'price': float,
    'sqft_above': int,
    'sqft_basement': int,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot': int,
    'sqft_lot15': float,
    'view': int,
    'waterfront': int,
    'yr_built': int,
    'yr_renovated': int,
    'zipcode': str,
}

# Root folder of the course material; the CSV files live in its 'datasets' sub-folder.
regressionDir = '/home/weenkus/workspace/Machine Learning - University of Washington/Regression'

# Full dataset plus the train / test splits, all parsed with the same dtypes.
house_train = pa.read_csv(regressionDir + '/datasets/kc_house_train_data.csv', dtype=dtype_dict)
house_test = pa.read_csv(regressionDir + '/datasets/kc_house_test_data.csv', dtype=dtype_dict)
house = pa.read_csv(regressionDir + '/datasets/kc_house_data.csv', dtype=dtype_dict)
In [45]:
# Preview the first rows of the full dataset (shown as the cell's output).
house.head()
Out[45]:
In [46]:
# Show plots in jupyter
%matplotlib inline
plt.scatter(house.sqft_living, house.price, alpha=0.5)
plt.ylabel('')
plt.xlabel('price')
plt.show()
In [47]:
def get_numpy_data(data_sframe, features, output):
    """Build the regression design matrix and target vector from a table.

    Parameters
    ----------
    data_sframe : table that can be indexed by a column name and by a list of
        column names (a pandas DataFrame in this notebook).
    features : sequence of str
        Names of the columns to use as features, in the desired column order.
    output : str
        Name of the target column.

    Returns
    -------
    (features_matrix, output_array)
        features_matrix is a 2-D float ndarray whose first column is all ones
        (the intercept term) followed by the requested feature columns.
        output_array is a NumPy array holding a copy of the target column.

    The input table is not modified: the intercept column is created on the
    NumPy side instead of being written into ``data_sframe``.
    """
    feature_names = list(features)
    # Plain float ndarray (not np.matrix) so that np.dot and column slicing
    # downstream yield ordinary 1-D / 2-D arrays.
    feature_values = np.asarray(data_sframe[feature_names], dtype=float)
    constant_column = np.ones((feature_values.shape[0], 1))
    # Prepend the constant column so that weights[0] acts as the intercept.
    features_matrix = np.hstack([constant_column, feature_values])
    output_array = np.array(data_sframe[output])
    return features_matrix, output_array
In [48]:
def predict_outcome(feature_matrix, weights):
    """Return the model predictions, computed as np.dot(feature_matrix, weights)."""
    return np.dot(feature_matrix, weights)
In [49]:
def feature_derivative(errors, feature):
    """Return twice the dot product of the errors with one feature column.

    This is the derivative of the residual sum of squares with respect to the
    weight belonging to that feature.
    """
    return 2 * np.dot(errors, feature)
In [50]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size,
                                tolerance, max_iterations=None):
    """Fit linear-regression weights by gradient descent on the residual sum of squares.

    Parameters
    ----------
    feature_matrix : 2-D array-like (ndarray, np.matrix or nested lists)
        One column per weight.
    output : array-like
        Observed target values; flattened to 1-D.
    initial_weights : array-like
        Starting weights. Copied and converted to float, so the caller's
        object is never modified and integer inputs do not truncate updates.
    step_size : float
        Multiplier applied to the gradient in every update.
    tolerance : float
        Iteration stops once the gradient magnitude falls below this value.
    max_iterations : int or None, optional
        Upper bound on the number of updates; None (default) means no bound.

    Returns
    -------
    numpy.ndarray
        1-D float array of fitted weights.

    Raises
    ------
    ValueError
        If the gradient magnitude becomes non-finite (the descent diverged),
        which would otherwise make the loop run forever.
    """
    # Normalise the inputs to plain float ndarrays so the arithmetic below
    # yields ordinary 1-D arrays even when a np.matrix is passed in.
    features = np.asarray(feature_matrix, dtype=float)
    targets = np.asarray(output, dtype=float).ravel()
    weights = np.array(initial_weights, dtype=float).ravel()

    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        # errors = predictions - output
        errors = features.dot(weights) - targets
        # Derivative of the RSS for every weight at once: entry i equals
        # 2 * dot(errors, features[:, i]).
        gradient = 2 * features.T.dot(errors)
        gradient_magnitude = np.sqrt(np.sum(gradient ** 2))
        if not np.isfinite(gradient_magnitude):
            raise ValueError('gradient descent diverged; try a smaller step_size')
        # All weights move using the errors computed before the update.
        weights -= step_size * gradient
        if gradient_magnitude < tolerance:
            break
        iteration += 1
    return weights
In [51]:
# Simple model: price explained by living area only, built from the training split.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(house_train, simple_features, my_output)

# Gradient-descent settings for the simple model.
initial_weights = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7
In [52]:
# Fit the simple model on the training split.
simple_weights_train = regression_gradient_descent(
    simple_feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)
In [53]:
# Show the weights fitted on the training split.
print('Weights: ', simple_weights_train)
In [54]:
# Rebuild the simple-model matrices from the test split and run gradient
# descent on them with the same starting weights and settings.
# NOTE(review): this fits a second model on the test data rather than
# evaluating simple_weights_train on it - confirm that is intended.
simple_feature_matrix, output = get_numpy_data(house_test, simple_features, my_output)
simple_weights_test = regression_gradient_descent(
    simple_feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)
In [55]:
# Show the weights obtained from the test split.
print('Weights: ', simple_weights_test)
In [57]:
# Residual sum of squares of the simple model on the test split.
# ravel() flattens the predictions whether simple_feature_matrix is a plain
# ndarray or a np.matrix (whose dot product comes back as a 1 x n matrix).
test_predictions = np.asarray(np.dot(simple_feature_matrix, simple_weights_test)).ravel()
test_residuals = np.asarray(output).ravel() - test_predictions
# RSS is the sum of the squared residuals, a single number.
RSS_test = np.sum(test_residuals ** 2)
print(RSS_test)
In [ ]:
# Print the simple model's predictions for the current feature matrix.
print(np.dot(simple_feature_matrix, simple_weights_test))
In [ ]:
# Two-feature model: living area plus 'sqft_living15', built from the training split.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(house_train, model_features, my_output)

# Gradient-descent settings for the two-feature model.
initial_weights = np.array([-100000.0, 1.0, 1.0])
step_size = 4e-12
tolerance = 1e9
In [ ]:
# Fit the two-feature model on the training split.
weights_train = regression_gradient_descent(
    feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)
In [ ]:
# Show the weights of the two-feature model.
print('Weights: ', weights_train)
In [ ]:
# Rebuild the two-feature matrices from the test split, then display the
# predictions of the training-fitted weights on it.
(feature_matrix, output) = get_numpy_data(house_test, model_features, my_output)
np.dot(feature_matrix, weights_train)
In [ ]:
# Price of the entry labelled 0 in the test split, for comparison with the prediction.
print(house_test['price'][0])
In [ ]:
# Residual sum of squares of the two-feature model on the test split.
# The whole weight vector is used (not just its first entry), and ravel()
# flattens the predictions whether feature_matrix is an ndarray or a np.matrix.
model_weights = np.asarray(weights_train, dtype=float).ravel()
test_predictions = np.asarray(np.dot(feature_matrix, model_weights)).ravel()
test_residuals = np.asarray(output).ravel() - test_predictions
# RSS is the sum of the squared residuals, a single number.
RSS_test = np.sum(test_residuals ** 2)
print(RSS_test)
In [ ]:
In [ ]: